{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "\n",
    "class MyCrawler:\n",
    "    def __init__(self, filename):\n",
    "        self.filename = filename\n",
    "        self.headers = {\n",
    "            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36'\n",
    "        }\n",
    "\n",
    "    def download(self, url: str):\n",
    "        r = requests.get(url, headers=self.headers)\n",
    "        return r.text\n",
    "\n",
    "    def extract(self, content, pattern):\n",
    "        res = re.findall(pattern, content)\n",
    "        return res\n",
    "\n",
    "    def save(self, info):\n",
    "        with open(self.filename, 'a', encoding='utf-8') as f:\n",
    "            for item in info:\n",
    "                f.write(''.join(item) + '\\n') \n",
    "\n",
    "    def crawl(self, url: str, pattern: str, headers=None):\n",
    "        if headers:\n",
    "            self.headers.update(headers)\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dou_crawler = MyCrawler('douban.txt')\n",
    "\n",
    "url = 'https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C'\n",
    "\n",
    "headers = {\n",
    "    'Connection': 'keep-alive',\n",
    "    'Cache-Control': 'max-age=0',\n",
    "    'Upgrade-Insecure-Requests': '1',\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36',\n",
    "    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',\n",
    "    'Sec-Fetch-Site': 'same-origin',\n",
    "    'Sec-Fetch-Mode': 'navigate',\n",
    "    'Sec-Fetch-User': '?1',\n",
    "    'Sec-Fetch-Dest': 'document',\n",
    "    'Referer': 'https://book.douban.com/',\n",
    "    'Accept-Language': 'zh-CN,zh;q=0.9',\n",
    "}\n",
    "\n",
    "content = dou_crawler.download(url)\n",
    "assert('Neural Networks and Deep Learning' in content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "[('https://img1.doubanio.com/view/subject/s/public/s33631858',\n  'https://book.douban.com/subject/35044046/',\n  '神经网络与深度学习',\n  '邱锡鹏 / 机械工业出版社 / 2020-4-10 / 149.00元',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">9.4</span>\\n\\n    <span class=\"pl\">\\n        (162人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s29738046',\n  'https://book.douban.com/subject/30192800/',\n  'Python神经网络编程',\n  '[英]塔里克·拉希德（Tariq Rashid） / 林赐 / 人民邮电出版社 / 2018-4 / 69.00元',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">9.2</span>\\n\\n    <span class=\"pl\">\\n        (459人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s29839337',\n  'https://book.douban.com/subject/30293801/',\n  'Python深度学习',\n  '[美] 弗朗索瓦•肖莱 / 张亮 / 人民邮电出版社 / 2018-8 / 119.00元',\n  '<span class=\"allstar50\"></span>\\n        <span class=\"rating_nums\">9.6</span>\\n\\n    <span class=\"pl\">\\n        (595人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s29815955',\n  'https://book.douban.com/subject/30270959/',\n  '深度学习入门',\n  '[ 日］  斋藤康毅 / 陆宇杰 / 人民邮电出版社 / 2018-7 / 59.00元',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">9.4</span>\\n\\n    <span class=\"pl\">\\n        (514人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s32295077',\n  'https://book.douban.com/subject/33414479/',\n  '深度学习的数学',\n  '[日]涌井良幸、[日]涌井贞美 / 杨瑞龙 / 人民邮电出版社 / 2019-4 / 69.00元',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">8.8</span>\\n\\n    <span class=\"pl\">\\n        (118人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s33545334',\n  'https://book.douban.com/subject/34927262/',\n  '深入浅出图神经网络：GNN原理解析',\n  '刘忠雨\\u3000李彦霖\\u3000周洋\\u3000著 / 机械工业出版社 / 2019-12-25 / 89元',\n  '<span class=\"allstar25\"></span>\\n        <span class=\"rating_nums\">5.2</span>\\n\\n    <span class=\"pl\">\\n        (42人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s33557648',\n  'https://book.douban.com/subject/34941715/',\n  '数字思维',\n  '[葡] 阿林多•奥利维拉 / 胡小锐 / 中信出版社 / 2020-1-1 / 69.00',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">8.8</span>\\n\\n    <span class=\"pl\">\\n        (15人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s29936638',\n  'https://book.douban.com/subject/30236893/',\n  '神经网络设计（原书第2版）',\n  'Martin T. Hagan、Howard B. Demuth、Mark H. Beale / 章毅 / 机械工业出版社 / 2017-11 / 99.00元',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">8.6</span>\\n\\n    <span class=\"pl\">\\n        (18人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s28342396',\n  'https://book.douban.com/subject/26666358/',\n  '连接组：造就独一无二的你',\n  '[美] 承现峻 / 孙天齐 / 清华大学出版社 / 2016-1 / 45',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">8.5</span>\\n\\n    <span class=\"pl\">\\n        (290人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s28855545',\n  'https://book.douban.com/subject/26727997/',\n  'Neural Networks and Deep Learning',\n  'Michael Nielsen / 2016-1',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">9.4</span>\\n\\n    <span class=\"pl\">\\n        (203人评价)\\n    </span>'),\n ('https://img3.doubanio.com/view/subject/s/public/s4410591',\n  'https://book.douban.com/subject/4146246/',\n  '神经网络在应用科学和工程中的应用',\n  '萨马拉辛荷 / 2010-1 / 88.00元',\n  '<span class=\"pl\">\\n        (少于10人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s29877486',\n  'https://book.douban.com/subject/30333961/',\n  '图解深度学习与神经网络：从张量到TensorFlow实现',\n  '张平 / 电子工业出版社 / 2018-10 / 79.00元',\n  '<span class=\"pl\">\\n        (少于10人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s1663944',\n  'https://book.douban.com/subject/1159821/',\n  '意识的宇宙',\n  '[美] 杰拉尔德·埃德尔曼、[美] 朱利欧·托诺尼 / 顾凡及 / 上海科学技术出版社 / 2004-1 / 27.00元',\n  '<span class=\"allstar40\"></span>\\n        <span class=\"rating_nums\">8.3</span>\\n\\n    <span class=\"pl\">\\n        (193人评价)\\n    </span>'),\n ('https://img3.doubanio.com/view/subject/s/public/s29249951',\n  'https://book.douban.com/subject/26945232/',\n  'Make Your Own Neural Network',\n  'Tariq Rashid / CreateSpace Independent Publishing Platform / 2016-3-31 / USD 45.00',\n  '<span class=\"allstar50\"></span>\\n        <span class=\"rating_nums\">9.6</span>\\n\\n    <span class=\"pl\">\\n        (55人评价)\\n    </span>'),\n ('https://img9.doubanio.com/view/subject/s/public/s1695376',\n  'https://book.douban.com/subject/1138922/',\n  '神经网络原理(原书第2版)',\n  'Simon Haykin / 叶世伟、史忠植 / 机械工业出版社 / 2004-1 / 69.00元',\n  '<span class=\"allstar35\"></span>\\n        <span class=\"rating_nums\">7.3</span>\\n\\n    <span class=\"pl\">\\n        (54人评价)\\n    </span>'),\n ('https://img3.doubanio.com/view/subject/s/public/s28910563',\n  'https://book.douban.com/subject/26840215/',\n  'Hands-On Machine Learning with Scikit-Learn and TensorFlow',\n  'Aurélien Géron / O&#39;Reilly Media / 2017-1-25 / USD 49.99',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">9.2</span>\\n\\n    <span class=\"pl\">\\n        (400人评价)\\n    </span>'),\n ('https://img3.doubanio.com/view/subject/s/public/s3898822',\n  'https://book.douban.com/subject/2584657/',\n  'Neural Networks and Learning Machines',\n  'Simon O. Haykin / Pearson / 2008-11-28 / USD 252.40',\n  '<span class=\"allstar45\"></span>\\n        <span class=\"rating_nums\">8.7</span>\\n\\n    <span class=\"pl\">\\n        (53人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s6458908',\n  'https://book.douban.com/subject/3890040/',\n  '神经网络控制',\n  '徐丽娜 / 2009-7 / 28.00元',\n  '<span class=\"pl\">\\n        (少于10人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s28107307',\n  'https://book.douban.com/subject/26422529/',\n  'Neural Networks and Statistical Learning',\n  'Ke-Lin Du、M. N. S. Swamy / Springer / 2013-12-7 / USD 129.00',\n  '<span class=\"pl\">\\n        (少于10人评价)\\n    </span>'),\n ('https://img1.doubanio.com/view/subject/s/public/s10403219',\n  'https://book.douban.com/subject/3349636/',\n  '精通Visual C++指纹模式识别系统算法及实现',\n  '2008-12 / 59.00元',\n  '<span class=\"pl\">\\n        (少于10人评价)\\n    </span>')]"
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "re.findall (\n",
    "    'src=\"(.*?).jpg\"[\\S\\s]*?<a href=\"(.*?)\"\\stitle=\"(.*?)\"[\\S\\s]*?<div\\sclass=\"pub\">\\s*(.*?)\\s*<\\/div>[\\S\\s]*?<div\\sclass=\"star\\sclearfix\">\\s*([\\S\\s]*?)\\s*<\\/div>',content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MyDoubanCrawler(MyCrawler):\n",
    "    def extract(self, content, pattern_main, pattern_star):\n",
    "        res = re.findall(pattern_main, content)\n",
    "        for index in range(len(res)):\n",
    "            if 'allstar' in res[index][4]:\n",
    "                items = re.findall(pattern_star, res[index][4])\n",
    "            else:\n",
    "                items = [['0', '0', '0']]\n",
    "            res[index] = list(res[index])\n",
    "            del res[index][4]\n",
    "            res[index].extend(items[0])\n",
    "        return res\n",
    "            \n",
    "    def crawl(self, url, pattern_main, pattern_star, headers=None):\n",
    "        if headers:\n",
    "            self.headers.update(headers)\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern_main, pattern_star)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "dou_crawler = MyDoubanCrawler('douban.txt')\n",
    "\n",
    "dou_crawler.crawl(\n",
    "    'https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C',\n",
    "    'src=\"(.*?).jpg\"[\\S\\s]*?<a href=\"(.*?)\"\\stitle=\"(.*?)\"[\\S\\s]*?<div\\sclass=\"pub\">\\s*(.*?)\\s*<\\/div>[\\S\\s]*?<div\\sclass=\"star\\sclearfix\">\\s*([\\S\\s]*?)\\s*<\\/div>',\n",
    "    '<span class=\"allstar(\\d+)\".*?[\\S\\s]*?rating_nums\">(.*?)<\\/span>[\\S\\s]*?\\((\\d+)',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}